//include/deal.II-translator/matrix_free/cuda_fe_evaluation_0.txt
// ---------------------------------------------------------------------
//
// Copyright (C) 2016 - 2020 by the deal.II authors
//
// This file is part of the deal.II library.
//
// The deal.II library is free software; you can use it, redistribute
// it, and/or modify it under the terms of the GNU Lesser General
// Public License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
// The full text of the license can be found in the file LICENSE.md at
// the top level directory of deal.II.
//
// ---------------------------------------------------------------------

#ifndef dealii_cuda_fe_evaluation_h
#define dealii_cuda_fe_evaluation_h

#include <deal.II/base/config.h>

#ifdef DEAL_II_COMPILER_CUDA_AWARE

#  include <deal.II/base/tensor.h>
#  include <deal.II/base/utilities.h>

#  include <deal.II/lac/cuda_atomic.h>
#  include <deal.II/lac/cuda_vector.h>

#  include <deal.II/matrix_free/cuda_hanging_nodes_internal.h>
#  include <deal.II/matrix_free/cuda_matrix_free.h>
#  include <deal.II/matrix_free/cuda_matrix_free.templates.h>
#  include <deal.II/matrix_free/cuda_tensor_product_kernels.h>

DEAL_II_NAMESPACE_OPEN

/**
 * 用于CUDA包装器的命名空间
 *
 *
 */
namespace CUDAWrappers
{
  namespace internal
  {
    /**
     * 计算给定的线程ID、维度和每个空间维度中的点的数量的Dof/Quad索引。
     *
     */
    template <int dim, int n_points_1d>
    __device__ inline unsigned int
    compute_index()
    {
      return (dim == 1 ?
                threadIdx.x % n_points_1d :
                dim == 2 ?
                threadIdx.x % n_points_1d + n_points_1d * threadIdx.y :
                threadIdx.x % n_points_1d +
                    n_points_1d * (threadIdx.y + n_points_1d * threadIdx.z));
    }
  } // namespace internal

  /**
   * 该类提供了在正交点和单元格积分上评估函数所需的所有函数。在功能上，这个类类似于FEValues<dim>。
   * 该类有五个模板参数。      @tparam  dim 本类使用的维度
   * @tparam  fe_degree
   * 每一坐标方向具有fe_degree+1个自由度的张量指令有限元的度数
   * @tparam  n_q_points_1d
   * 一维正交公式中的点数，默认为fe_degree+1  @tparam
   * n_components
   * 解决PDEs系统时的矢量元件数量。如果同一个操作被应用于一个PDE的几个分量（例如，一个矢量拉普拉斯方程），它们可以通过一次调用同时应用（而且通常更有效率）。默认为1
   * @tparam  数字 数字格式， @p double  或  @p float.  默认为  @p
   * 双重。
   * @ingroup CUDAWrappers
   *
   */
  template <int dim,
            int fe_degree,
            int n_q_points_1d = fe_degree + 1,
            int n_components_ = 1,
            typename Number   = double>
  class FEEvaluation
  {
  public:
    /**
     * 标量的一个别名。
     *
     */
    using value_type = Number;

    /**
     * 矢量的别名。
     *
     */
    using gradient_type = Tensor<1, dim, Number>;

    /**
     * 内核特定信息的别名。
     *
     */
    using data_type = typename MatrixFree<dim, Number>::Data;

    /**
     * 尺寸。
     *
     */
    static constexpr unsigned int dimension = dim;

    /**
     * 组件的数量。
     *
     */
    static constexpr unsigned int n_components = n_components_;

    /**
     * 每个单元的正交点的数量。
     *
     */
    static constexpr unsigned int n_q_points =
      Utilities::pow(n_q_points_1d, dim);

    /**
     * 每个单元的张量自由度的数量。
     *
     */
    static constexpr unsigned int tensor_dofs_per_cell =
      Utilities::pow(fe_degree + 1, dim);

    /**
     * 构造函数。
     *
     */
    __device__
    FEEvaluation(const unsigned int       cell_id,
                 const data_type *        data,
                 SharedData<dim, Number> *shdata);

    /**
     * 对于向量 @p src,
     * 读出当前单元的自由度的值，并在内部存储。当没有约束条件时，与函数
     * DoFAccessor::get_interpolated_dof_values
     * 的功能类似，但它也包括来自悬挂节点的约束条件，所以一旦也可以把它看作是与
     * AffineConstraints::read_dof_valuess 类似的函数。
     *
     */
    __device__ void
    read_dof_values(const Number *src);

    /**
     * 取当前单元格的dof值内部存储的值，并将其加到向量中
     * @p dst.
     * 该函数也在写操作过程中应用约束。因此，其功能与函数
     * AffineConstraints::distribute_local_to_global. 相似。
     *
     */
    __device__ void
    distribute_local_to_global(Number *dst) const;

    /**
     * 在单元格上的正交点上评估输入矢量中的DoF值的函数值和FE函数的梯度。函数参数指定哪些部分应被实际计算。这个函数需要在函数
     * @p get_value() 或 @p get_gradient() 提供有用信息之前调用。
     *
     */
    __device__ void
    evaluate(const bool evaluate_val, const bool evaluate_grad);

    /**
     * 该函数获取存储在正交点上的值和/或梯度，通过单元上的所有基函数/梯度进行测试，并执行单元积分。两个函数参数
     * @p integrate_val 和 @p integrate_grad
     * 用于启用/禁用某些值或梯度。
     *
     */
    __device__ void
    integrate(const bool integrate_val, const bool integrate_grad);

    /**
     * 与上述相同，只是正交点是由线程id计算的。
     *
     */
    __device__ value_type
               get_value() const;

    /**
     * 同上，除了本地的dof指数是由线程id计算出来的。
     *
     */
    __device__ value_type
               get_dof_value() const;

    /**
     * 同上，除了正交点是由线程ID计算出来的。
     *
     */
    __device__ void
    submit_value(const value_type &val_in);

    /**
     * 同上，除了本地的dof指数是由线程id计算出来的。
     *
     */
    __device__ void
    submit_dof_value(const value_type &val_in);

    /**
     * 同上，除了正交点是由线程ID计算出来的。
     *
     */
    __device__ gradient_type
               get_gradient() const;

    /**
     * 同上，除了正交点是由线程ID计算出来的。
     *
     */
    __device__ void
    submit_gradient(const gradient_type &grad_in);

    // clang-format off
    /**
     * 同上，只是函数器 @p func
     * 只接受一个输入参数（fe_eval），并从线程id计算正交点。
     * @p func  需要定义代码 __device__ void operator()(
     * CUDAWrappers::FEEvaluation<dim,  fe_degree, n_q_points_1d,
     * n_components, Number>fe_eval) const; /endcode
     *
     */
    // clang-format on
    template <typename Functor>
    __device__ void
    apply_for_each_quad_point(const Functor &func);

  private:
    types::global_dof_index *local_to_global;
    unsigned int             n_cells;
    unsigned int             padding_length;
    const unsigned int       mf_object_id;

    const unsigned int constraint_mask;

    const bool use_coloring;

    Number *inv_jac;
    Number *JxW;

    // Internal buffer
    Number *values;
    Number *gradients[dim];
  };



  template <int dim,
            int fe_degree,
            int n_q_points_1d,
            int n_components_,
            typename Number>
  __device__
  FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>::
    FEEvaluation(const unsigned int       cell_id,
                 const data_type *        data,
                 SharedData<dim, Number> *shdata)
    : n_cells(data->n_cells)
    , padding_length(data->padding_length)
    , mf_object_id(data->id)
    , constraint_mask(data->constraint_mask[cell_id])
    , use_coloring(data->use_coloring)
    , values(shdata->values)
  {
    local_to_global = data->local_to_global + padding_length * cell_id;
    inv_jac         = data->inv_jacobian + padding_length * cell_id;
    JxW             = data->JxW + padding_length * cell_id;

    for (unsigned int i = 0; i < dim; ++i)
      gradients[i] = shdata->gradients[i];
  }



  template <int dim,
            int fe_degree,
            int n_q_points_1d,
            int n_components_,
            typename Number>
  __device__ void
  FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>::
    read_dof_values(const Number *src)
  {
    static_assert(n_components_ == 1, "This function only supports FE with one \
                  components");
    const unsigned int idx = internal::compute_index<dim, n_q_points_1d>();

    const types::global_dof_index src_idx = local_to_global[idx];
    // Use the read-only data cache.
    values[idx] = __ldg(&src[src_idx]);

    __syncthreads();

    internal::resolve_hanging_nodes<dim, fe_degree, false>(constraint_mask,
                                                           values);
  }



  template <int dim,
            int fe_degree,
            int n_q_points_1d,
            int n_components_,
            typename Number>
  __device__ void
  FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>::
    distribute_local_to_global(Number *dst) const
  {
    static_assert(n_components_ == 1, "This function only supports FE with one \
                  components");
    internal::resolve_hanging_nodes<dim, fe_degree, true>(constraint_mask,
                                                          values);

    const unsigned int idx = internal::compute_index<dim, n_q_points_1d>();

    const types::global_dof_index destination_idx = local_to_global[idx];

    if (use_coloring)
      dst[destination_idx] += values[idx];
    else
      atomicAdd(&dst[destination_idx], values[idx]);
  }



  template <int dim,
            int fe_degree,
            int n_q_points_1d,
            int n_components_,
            typename Number>
  __device__ void
  FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>::evaluate(
    const bool evaluate_val,
    const bool evaluate_grad)
  {
    // First evaluate the gradients because it requires values that will be
    // changed if evaluate_val is true
    internal::EvaluatorTensorProduct<
      internal::EvaluatorVariant::evaluate_general,
      dim,
      fe_degree,
      n_q_points_1d,
      Number>
      evaluator_tensor_product(mf_object_id);
    if (evaluate_val == true && evaluate_grad == true)
      {
        evaluator_tensor_product.value_and_gradient_at_quad_pts(values,
                                                                gradients);
        __syncthreads();
      }
    else if (evaluate_grad == true)
      {
        evaluator_tensor_product.gradient_at_quad_pts(values, gradients);
        __syncthreads();
      }
    else if (evaluate_val == true)
      {
        evaluator_tensor_product.value_at_quad_pts(values);
        __syncthreads();
      }
  }



  template <int dim,
            int fe_degree,
            int n_q_points_1d,
            int n_components_,
            typename Number>
  __device__ void
  FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>::integrate(
    const bool integrate_val,
    const bool integrate_grad)
  {
    internal::EvaluatorTensorProduct<
      internal::EvaluatorVariant::evaluate_general,
      dim,
      fe_degree,
      n_q_points_1d,
      Number>
      evaluator_tensor_product(mf_object_id);
    if (integrate_val == true && integrate_grad == true)
      {
        evaluator_tensor_product.integrate_value_and_gradient(values,
                                                              gradients);
      }
    else if (integrate_val == true)
      {
        evaluator_tensor_product.integrate_value(values);
        __syncthreads();
      }
    else if (integrate_grad == true)
      {
        evaluator_tensor_product.integrate_gradient<false>(values, gradients);
        __syncthreads();
      }
  }



  template <int dim,
            int fe_degree,
            int n_q_points_1d,
            int n_components_,
            typename Number>
  __device__ typename FEEvaluation<dim,
                                   fe_degree,
                                   n_q_points_1d,
                                   n_components_,
                                   Number>::value_type
  FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>::
    get_value() const
  {
    const unsigned int q_point = internal::compute_index<dim, n_q_points_1d>();
    return values[q_point];
  }



  template <int dim,
            int fe_degree,
            int n_q_points_1d,
            int n_components_,
            typename Number>
  __device__ typename FEEvaluation<dim,
                                   fe_degree,
                                   n_q_points_1d,
                                   n_components_,
                                   Number>::value_type
  FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>::
    get_dof_value() const
  {
    const unsigned int dof = internal::compute_index<dim, fe_degree + 1>();
    return values[dof];
  }



  template <int dim,
            int fe_degree,
            int n_q_points_1d,
            int n_components_,
            typename Number>
  __device__ void
  FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>::
    submit_value(const value_type &val_in)
  {
    const unsigned int q_point = internal::compute_index<dim, n_q_points_1d>();
    values[q_point]            = val_in * JxW[q_point];
  }



  template <int dim,
            int fe_degree,
            int n_q_points_1d,
            int n_components_,
            typename Number>
  __device__ void
  FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>::
    submit_dof_value(const value_type &val_in)
  {
    const unsigned int dof = internal::compute_index<dim, fe_degree + 1>();
    values[dof]            = val_in;
  }



  template <int dim,
            int fe_degree,
            int n_q_points_1d,
            int n_components_,
            typename Number>
  __device__ typename FEEvaluation<dim,
                                   fe_degree,
                                   n_q_points_1d,
                                   n_components_,
                                   Number>::gradient_type
  FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>::
    get_gradient() const
  {
    static_assert(n_components_ == 1, "This function only supports FE with one \
                  components");

    // TODO optimize if the mesh is uniform
    const unsigned int q_point = internal::compute_index<dim, n_q_points_1d>();
    const Number *     inv_jacobian = &inv_jac[q_point];
    gradient_type      grad;
    for (int d_1 = 0; d_1 < dim; ++d_1)
      {
        Number tmp = 0.;
        for (int d_2 = 0; d_2 < dim; ++d_2)
          tmp += inv_jacobian[padding_length * n_cells * (dim * d_2 + d_1)] *
                 gradients[d_2][q_point];
        grad[d_1] = tmp;
      }

    return grad;
  }



  template <int dim,
            int fe_degree,
            int n_q_points_1d,
            int n_components_,
            typename Number>
  __device__ void
  FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>::
    submit_gradient(const gradient_type &grad_in)
  {
    // TODO optimize if the mesh is uniform
    const unsigned int q_point = internal::compute_index<dim, n_q_points_1d>();
    const Number *     inv_jacobian = &inv_jac[q_point];
    for (int d_1 = 0; d_1 < dim; ++d_1)
      {
        Number tmp = 0.;
        for (int d_2 = 0; d_2 < dim; ++d_2)
          tmp += inv_jacobian[n_cells * padding_length * (dim * d_1 + d_2)] *
                 grad_in[d_2];
        gradients[d_1][q_point] = tmp * JxW[q_point];
      }
  }



  template <int dim,
            int fe_degree,
            int n_q_points_1d,
            int n_components_,
            typename Number>
  template <typename Functor>
  __device__ void
  FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>::
    apply_for_each_quad_point(const Functor &func)
  {
    func(this);

    __syncthreads();
  }
} // namespace CUDAWrappers

DEAL_II_NAMESPACE_CLOSE

#endif

#endif


